import pandas as pd
import pandas.plotting
from pandas.plotting import scatter_matrix
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import squarify
from matplotlib import animation
from matplotlib.animation import FuncAnimation,FFMpegFileWriter
from mpl_toolkits.mplot3d import Axes3D
import os
# Work from the folder that holds the course material so that the relative
# path 'dataset.xlsx' below resolves.
base= r'D:\Navigation\Téléchargements\DataScienceFr\Machine learning sklearn'
base=base.replace('\\','/')
os.chdir(base)

# Allow up to 111 rows/columns on display so the wide frame is not truncated.
# Full option names are used: abbreviated names only work through pattern
# matching and fail as soon as they become ambiguous.
pd.set_option('display.max_rows',111)
pd.set_option('display.max_columns',111)

# The first spreadsheet column becomes the index (index_col=0).
# `encoding` is not a read_excel parameter and recent pandas releases raise a
# TypeError on unknown keywords, so it is no longer passed.
Dataset=pd.read_excel('dataset.xlsx',index_col=0)
Dataset.head()
| Patient age quantile | SARS-Cov-2 exam result | Patient addmited to regular ward (1=yes, 0=no) | Patient addmited to semi-intensive unit (1=yes, 0=no) | Patient addmited to intensive care unit (1=yes, 0=no) | Hematocrit | Hemoglobin | Platelets | Mean platelet volume | Red blood Cells | Lymphocytes | Mean corpuscular hemoglobin concentration (MCHC) | Leukocytes | Basophils | Mean corpuscular hemoglobin (MCH) | Eosinophils | Mean corpuscular volume (MCV) | Monocytes | Red blood cell distribution width (RDW) | Serum Glucose | Respiratory Syncytial Virus | Influenza A | Influenza B | Parainfluenza 1 | CoronavirusNL63 | Rhinovirus/Enterovirus | Mycoplasma pneumoniae | Coronavirus HKU1 | Parainfluenza 3 | Chlamydophila pneumoniae | Adenovirus | Parainfluenza 4 | Coronavirus229E | CoronavirusOC43 | Inf A H1N1 2009 | Bordetella pertussis | Metapneumovirus | Parainfluenza 2 | Neutrophils | Urea | Proteina C reativa mg/dL | Creatinine | Potassium | Sodium | Influenza B, rapid test | Influenza A, rapid test | Alanine transaminase | Aspartate transaminase | Gamma-glutamyltransferase | Total Bilirubin | Direct Bilirubin | Indirect Bilirubin | Alkaline phosphatase | Ionized calcium | Strepto A | Magnesium | pCO2 (venous blood gas analysis) | Hb saturation (venous blood gas analysis) | Base excess (venous blood gas analysis) | pO2 (venous blood gas analysis) | Fio2 (venous blood gas analysis) | Total CO2 (venous blood gas analysis) | pH (venous blood gas analysis) | HCO3 (venous blood gas analysis) | Rods # | Segmented | Promyelocytes | Metamyelocytes | Myelocytes | Myeloblasts | Urine - Esterase | Urine - Aspect | Urine - pH | Urine - Hemoglobin | Urine - Bile pigments | Urine - Ketone Bodies | Urine - Nitrite | Urine - Density | Urine - Urobilinogen | Urine - Protein | Urine - Sugar | Urine - Leukocytes | Urine - Crystals | Urine - Red blood cells | Urine - Hyaline cylinders | Urine - Granular cylinders | Urine - Yeasts | Urine - Color | Partial thromboplastin time (PTT) | 
Relationship (Patient/Normal) | International normalized ratio (INR) | Lactic Dehydrogenase | Prothrombin time (PT), Activity | Vitamin B12 | Creatine phosphokinase (CPK) | Ferritin | Arterial Lactic Acid | Lipase dosage | D-Dimer | Albumin | Hb saturation (arterial blood gases) | pCO2 (arterial blood gas analysis) | Base excess (arterial blood gas analysis) | pH (arterial blood gas analysis) | Total CO2 (arterial blood gas analysis) | HCO3 (arterial blood gas analysis) | pO2 (arterial blood gas analysis) | Arteiral Fio2 | Phosphor | ctO2 (arterial blood gas analysis) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Patient ID | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 44477f75e8169d2 | 13 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 126e9dd13932f68 | 17 | negative | 0 | 0 | 0 | 0.236515 | -0.02234 | -0.517413 | 0.010677 | 0.102004 | 0.318366 | -0.95079 | -0.09461 | -0.223767 | -0.292269 | 1.482158 | 0.166192 | 0.357547 | -0.625073 | -0.140648 | not_detected | not_detected | not_detected | not_detected | not_detected | detected | NaN | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | -0.619086 | 1.198059 | -0.147895 | 2.089928 | -0.305787 | 0.862512 | negative | negative | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| a46b4402a0e5696 | 8 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| f7d619a94f97c45 | 5 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| d9e41465789c2b5 | 15 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | not_detected | not_detected | not_detected | not_detected | not_detected | detected | NaN | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Relation Variables / Target :
target / blood : les taux de Monocytes, Platelets et Leukocytes semblent liés au covid-19 -> hypothèse à tester.
target / age : les individus de faible âge sont très peu contaminés ? -> attention, on ne connaît pas l'âge réel (seulement un quantile) et on ne sait pas de quand date le dataset (s'il s'agit d'enfants, on sait qu'ils sont touchés autant que les adultes). En revanche, cette variable pourra être intéressante à comparer avec les résultats des tests sanguins.
target / viral : les doubles maladies sont très rares. Rhinovirus/Enterovirus positif - covid-19 négatif ? -> hypothèse à tester ? Mais il est possible que la région ait subi une épidémie de ce virus. De plus, on peut très bien avoir 2 virus en même temps. Tout ça n'a aucun lien avec le covid-19.
Relation Variables / Variables :
Relation quanti * quanti
Relation quanti * quali
Relation quali * quali
Les individus atteints du covid-19 ont des taux de Leukocytes, Monocytes, Platelets significativement différents
Les individus atteints d'une quelconque maladie ont des taux significativement différents
# Work on a copy so that the loaded Dataset frame is not modified by the
# column filtering and column additions done on df further down.
df=Dataset.copy()
print(df.columns)
Index(['Patient age quantile', 'SARS-Cov-2 exam result',
'Patient addmited to regular ward (1=yes, 0=no)',
'Patient addmited to semi-intensive unit (1=yes, 0=no)',
'Patient addmited to intensive care unit (1=yes, 0=no)', 'Hematocrit',
'Hemoglobin', 'Platelets', 'Mean platelet volume ', 'Red blood Cells',
...
'Hb saturation (arterial blood gases)',
'pCO2 (arterial blood gas analysis)',
'Base excess (arterial blood gas analysis)',
'pH (arterial blood gas analysis)',
'Total CO2 (arterial blood gas analysis)',
'HCO3 (arterial blood gas analysis)',
'pO2 (arterial blood gas analysis)', 'Arteiral Fio2', 'Phosphor',
'ctO2 (arterial blood gas analysis)'],
dtype='object', length=110)
# (number of rows, number of columns) of the working frame.
print(df.shape)
(5644, 110)
# Count the columns of each dtype once, print the tally, then draw the same
# tally as a pie chart.
dtype_counts = df.dtypes.value_counts()
print(dtype_counts)
dtype_counts.plot.pie()
float64 70 object 36 int64 4 dtype: int64
<matplotlib.axes._subplots.AxesSubplot at 0x1b28ee4ae08>
# Missing-value map: one cell per (row, column), coloured by whether the
# value is NaN. The colour bar is hidden (cbar=False).
plt.figure(figsize=(20,10))
sns.heatmap(df.isna(), cbar=False)
<matplotlib.axes._subplots.AxesSubplot at 0x2c14646df08>
# Share of missing values in each column, sorted from the most complete
# column to the emptiest one (the mean of a boolean mask is its True ratio).
missing_share = df.isna().mean()
missing_share.sort_values(ascending=True)
Patient ID 0.000000 Patient age quantile 0.000000 SARS-Cov-2 exam result 0.000000 Patient addmited to regular ward (1=yes, 0=no) 0.000000 Patient addmited to semi-intensive unit (1=yes, 0=no) 0.000000 Patient addmited to intensive care unit (1=yes, 0=no) 0.000000 Influenza B 0.760099 Respiratory Syncytial Virus 0.760099 Influenza A 0.760099 Rhinovirus/Enterovirus 0.760454 Inf A H1N1 2009 0.760454 CoronavirusOC43 0.760454 Coronavirus229E 0.760454 Parainfluenza 4 0.760454 Adenovirus 0.760454 Chlamydophila pneumoniae 0.760454 Parainfluenza 3 0.760454 Coronavirus HKU1 0.760454 CoronavirusNL63 0.760454 Parainfluenza 1 0.760454 Bordetella pertussis 0.760454 Parainfluenza 2 0.760454 Metapneumovirus 0.760454 Influenza A, rapid test 0.854713 Influenza B, rapid test 0.854713 Hemoglobin 0.893161 Hematocrit 0.893161 Red blood cell distribution width (RDW) 0.893338 Platelets 0.893338 Mean corpuscular volume (MCV) 0.893338 Eosinophils 0.893338 Mean corpuscular hemoglobin (MCH) 0.893338 Basophils 0.893338 Leukocytes 0.893338 Mean corpuscular hemoglobin concentration (MCHC) 0.893338 Lymphocytes 0.893338 Red blood Cells 0.893338 Monocytes 0.893515 Mean platelet volume 0.893870 Neutrophils 0.909107 Proteina C reativa mg/dL 0.910347 Creatinine 0.924876 Urea 0.929660 Potassium 0.934266 Sodium 0.934444 Strepto A 0.941176 Aspartate transaminase 0.959957 Alanine transaminase 0.960135 Serum Glucose 0.963147 Total Bilirubin 0.967753 Direct Bilirubin 0.967753 Indirect Bilirubin 0.967753 Gamma-glutamyltransferase 0.972892 Alkaline phosphatase 0.974486 HCO3 (venous blood gas analysis) 0.975904 pH (venous blood gas analysis) 0.975904 Total CO2 (venous blood gas analysis) 0.975904 Base excess (venous blood gas analysis) 0.975904 pO2 (venous blood gas analysis) 0.975904 pCO2 (venous blood gas analysis) 0.975904 Hb saturation (venous blood gas analysis) 0.975904 International normalized ratio (INR) 0.976435 Creatine phosphokinase (CPK) 0.981573 Lactic Dehydrogenase 0.982105 Myeloblasts 0.982814 
Myelocytes 0.982814 Metamyelocytes 0.982814 Promyelocytes 0.982814 Rods # 0.982814 Segmented 0.982814 Relationship (Patient/Normal) 0.983877 Urine - Crystals 0.987597 Urine - Color 0.987597 Urine - Yeasts 0.987597 Urine - Red blood cells 0.987597 Urine - Leukocytes 0.987597 Urine - Density 0.987597 Urine - Bile pigments 0.987597 Urine - Hemoglobin 0.987597 Urine - pH 0.987597 Urine - Aspect 0.987597 Urine - Urobilinogen 0.987775 Urine - Granular cylinders 0.987775 Urine - Hyaline cylinders 0.988129 Urine - Protein 0.989369 Urine - Esterase 0.989369 Urine - Ketone Bodies 0.989901 Ionized calcium 0.991141 Magnesium 0.992913 ctO2 (arterial blood gas analysis) 0.995216 Hb saturation (arterial blood gases) 0.995216 pH (arterial blood gas analysis) 0.995216 Arterial Lactic Acid 0.995216 Total CO2 (arterial blood gas analysis) 0.995216 pCO2 (arterial blood gas analysis) 0.995216 HCO3 (arterial blood gas analysis) 0.995216 pO2 (arterial blood gas analysis) 0.995216 Base excess (arterial blood gas analysis) 0.995216 Ferritin 0.995925 Arteiral Fio2 0.996456 Phosphor 0.996456 Albumin 0.997697 Lipase dosage 0.998583 Vitamin B12 0.999468 Urine - Nitrite 0.999823 Fio2 (venous blood gas analysis) 0.999823 Partial thromboplastin time (PTT) 1.000000 Urine - Sugar 1.000000 Mycoplasma pneumoniae 1.000000 D-Dimer 1.000000 Prothrombin time (PT), Activity 1.000000 dtype: float64
# Keep only the columns with fewer than 90% missing values, then redraw the
# missing-value map on the reduced frame.
missing_fraction = df.isna().sum() / df.shape[0]
columns_to_keep = df.columns[missing_fraction < 0.9]
df = df[columns_to_keep]
plt.figure(figsize=(20, 10))
sns.heatmap(df.isna(), cbar=False)
<matplotlib.axes._subplots.AxesSubplot at 0x1b290671548>
# No 'Patient ID' column is dropped here: the file was read with index_col=0,
# so the first spreadsheet column is the index rather than a regular column.
df.head()
| Patient age quantile | SARS-Cov-2 exam result | Patient addmited to regular ward (1=yes, 0=no) | Patient addmited to semi-intensive unit (1=yes, 0=no) | Patient addmited to intensive care unit (1=yes, 0=no) | Hematocrit | Hemoglobin | Platelets | Mean platelet volume | Red blood Cells | Lymphocytes | Mean corpuscular hemoglobin concentration (MCHC) | Leukocytes | Basophils | Mean corpuscular hemoglobin (MCH) | Eosinophils | Mean corpuscular volume (MCV) | Monocytes | Red blood cell distribution width (RDW) | Respiratory Syncytial Virus | Influenza A | Influenza B | Parainfluenza 1 | CoronavirusNL63 | Rhinovirus/Enterovirus | Coronavirus HKU1 | Parainfluenza 3 | Chlamydophila pneumoniae | Adenovirus | Parainfluenza 4 | Coronavirus229E | CoronavirusOC43 | Inf A H1N1 2009 | Bordetella pertussis | Metapneumovirus | Parainfluenza 2 | Influenza B, rapid test | Influenza A, rapid test | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Patient ID | ||||||||||||||||||||||||||||||||||||||
| 44477f75e8169d2 | 13 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 126e9dd13932f68 | 17 | negative | 0 | 0 | 0 | 0.236515 | -0.02234 | -0.517413 | 0.010677 | 0.102004 | 0.318366 | -0.95079 | -0.09461 | -0.223767 | -0.292269 | 1.482158 | 0.166192 | 0.357547 | -0.625073 | not_detected | not_detected | not_detected | not_detected | not_detected | detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | negative | negative |
| a46b4402a0e5696 | 8 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| f7d619a94f97c45 | 5 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| d9e41465789c2b5 | 15 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | not_detected | not_detected | not_detected | not_detected | not_detected | detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | NaN | NaN |
# Class balance of the target: number of rows per test result.
df['SARS-Cov-2 exam result'].value_counts()
negative 5086 positive 558 Name: SARS-Cov-2 exam result, dtype: int64
# Same class balance, expressed as proportions of the total.
df['SARS-Cov-2 exam result'].value_counts(normalize=True)
negative 0.901134 positive 0.098866 Name: SARS-Cov-2 exam result, dtype: float64
# One figure per float column, each showing that column's distribution.
# The loop body is indented again; as flattened in the file it was not valid Python.
for col in df.select_dtypes('float'):
    plt.figure()
    sns.distplot(df[col])

# Names of the float columns that were just plotted.
df.select_dtypes('float').columns
Index(['Hematocrit', 'Hemoglobin', 'Platelets', 'Mean platelet volume ',
'Red blood Cells', 'Lymphocytes',
'Mean corpuscular hemoglobin concentration (MCHC)', 'Leukocytes',
'Basophils', 'Mean corpuscular hemoglobin (MCH)', 'Eosinophils',
'Mean corpuscular volume (MCV)', 'Monocytes',
'Red blood cell distribution width (RDW)'],
dtype='object')
# Distribution of the age quantile, plotted separately from the float columns.
sns.distplot(df['Patient age quantile'])
<matplotlib.axes._subplots.AxesSubplot at 0x1b291601fc8>
# Number of rows for each age-quantile value.
df['Patient age quantile'].value_counts()
11 380 4 366 9 359 0 334 7 319 2 315 13 313 14 299 5 294 6 281 16 279 19 275 15 269 17 263 18 259 3 251 1 234 12 197 10 190 8 167 Name: Patient age quantile, dtype: int64
# List the distinct values found in every object-typed column.
# The loop body is indented again; as flattened in the file it was not valid Python.
for col in df.select_dtypes('object'):
    # The column name is left-aligned and padded with '-' to 50 characters
    # so that the value lists line up.
    print(f'{col:-<50} {df[col].unique()}')
SARS-Cov-2 exam result---------------------------- ['negative' 'positive'] Respiratory Syncytial Virus----------------------- [nan 'not_detected' 'detected'] Influenza A--------------------------------------- [nan 'not_detected' 'detected'] Influenza B--------------------------------------- [nan 'not_detected' 'detected'] Parainfluenza 1----------------------------------- [nan 'not_detected' 'detected'] CoronavirusNL63----------------------------------- [nan 'not_detected' 'detected'] Rhinovirus/Enterovirus---------------------------- [nan 'detected' 'not_detected'] Coronavirus HKU1---------------------------------- [nan 'not_detected' 'detected'] Parainfluenza 3----------------------------------- [nan 'not_detected' 'detected'] Chlamydophila pneumoniae-------------------------- [nan 'not_detected' 'detected'] Adenovirus---------------------------------------- [nan 'not_detected' 'detected'] Parainfluenza 4----------------------------------- [nan 'not_detected' 'detected'] Coronavirus229E----------------------------------- [nan 'not_detected' 'detected'] CoronavirusOC43----------------------------------- [nan 'not_detected' 'detected'] Inf A H1N1 2009----------------------------------- [nan 'not_detected' 'detected'] Bordetella pertussis------------------------------ [nan 'not_detected' 'detected'] Metapneumovirus----------------------------------- [nan 'not_detected' 'detected'] Parainfluenza 2----------------------------------- [nan 'not_detected'] Influenza B, rapid test--------------------------- [nan 'negative' 'positive'] Influenza A, rapid test--------------------------- [nan 'negative' 'positive']
# Pie chart of the value frequencies for every object-typed column, one
# figure per column. The loop body is indented again (it was flattened).
for col in df.select_dtypes('object'):
    plt.figure()
    df[col].value_counts().plot.pie()

# df_object was read without ever being assigned in this file (NameError).
# NOTE(review): assumed to be the object-typed columns of df, which matches the
# columns of the table shown right after this cell; confirm against the notebook.
df_object = df.select_dtypes('object')
df_object.head()
| SARS-Cov-2 exam result | Respiratory Syncytial Virus | Influenza A | Influenza B | Parainfluenza 1 | CoronavirusNL63 | Rhinovirus/Enterovirus | Coronavirus HKU1 | Parainfluenza 3 | Chlamydophila pneumoniae | Adenovirus | Parainfluenza 4 | Coronavirus229E | CoronavirusOC43 | Inf A H1N1 2009 | Bordetella pertussis | Metapneumovirus | Parainfluenza 2 | Influenza B, rapid test | Influenza A, rapid test | statut | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | negative | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | inconnu |
| 1 | negative | not_detected | not_detected | not_detected | not_detected | not_detected | detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | negative | negative | inconnu |
| 2 | negative | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | inconnu |
| 3 | negative | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | inconnu |
| 4 | negative | not_detected | not_detected | not_detected | not_detected | not_detected | detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | NaN | NaN | inconnu |
# Split the rows by SARS-Cov-2 test result.
positive_df = df[df['SARS-Cov-2 exam result'] == 'positive']
negative_df = df[df['SARS-Cov-2 exam result'] == 'negative']

# Group the columns by their share of missing values: strictly between 88%
# and 90% for blood_columns, strictly between 75% and 88% for viral_columns.
missing_rate = df.isna().sum() / df.shape[0]
blood_columns = df.columns[(missing_rate < 0.9) & (missing_rate > 0.88)]
viral_columns = df.columns[(missing_rate < 0.88) & (missing_rate > 0.75)]

# One figure per blood column, overlaying the positive and negative
# distributions. The loop body is indented again (it was flattened).
for col in blood_columns:
    plt.figure()
    sns.distplot(positive_df[col], label='positive')
    sns.distplot(negative_df[col], label='negative')
    plt.legend()

# Start a new figure: when the file runs top to bottom, the count plot would
# otherwise be drawn on the axes of the last distribution figure.
plt.figure()
sns.countplot(x='Patient age quantile', hue='SARS-Cov-2 exam result', data=df)
<matplotlib.axes._subplots.AxesSubplot at 0x2c15381d2c8>
# Contingency table: SARS-Cov-2 result (rows) against the Influenza A result (columns).
pd.crosstab(df['SARS-Cov-2 exam result'],df['Influenza A'])
| Influenza A | detected | not_detected |
|---|---|---|
| SARS-Cov-2 exam result | ||
| negative | 18 | 1224 |
| positive | 0 | 112 |
# For every column in viral_columns, draw an annotated heatmap of its
# contingency table with the SARS-Cov-2 result (integer counts, fmt='d').
# The loop body is indented again; as flattened in the file it was not valid Python.
for col in viral_columns:
    plt.figure()
    contingency = pd.crosstab(df['SARS-Cov-2 exam result'], df[col])
    sns.heatmap(contingency, annot=True, fmt='d')
# researchpy's corr_case returns three objects, unpacked here as a textual
# description of the test, the correlation matrix and the matching p-values.
import researchpy as rp
corr_type, corr_matrix, corr_ps = rp.corr_case(df[blood_columns])
print(corr_type)
Pearson correlation test using list-wise deletion 0 Total observations used = 598
# Correlations between the variables
corr_matrix
| Hematocrit | Hemoglobin | Platelets | Mean platelet volume | Red blood Cells | Lymphocytes | Mean corpuscular hemoglobin concentration (MCHC) | Leukocytes | Basophils | Mean corpuscular hemoglobin (MCH) | Eosinophils | Mean corpuscular volume (MCV) | Monocytes | Red blood cell distribution width (RDW) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Hematocrit | 1 | 0.969 | -0.0889 | 0.0836 | 0.8907 | 0.0026 | 0.1286 | -0.0945 | 0.1291 | 0.0698 | 0.0274 | 0.0173 | 0.0847 | -0.2631 |
| Hemoglobin | 0.969 | 1 | -0.1288 | 0.0789 | 0.8654 | -0.0034 | 0.3677 | -0.1095 | 0.1163 | 0.1749 | 0.0152 | 0.0097 | 0.1003 | -0.3377 |
| Platelets | -0.0889 | -0.1288 | 1 | -0.3568 | -0.0461 | 0.0865 | -0.1704 | 0.4394 | -0.0226 | -0.1315 | 0.1681 | -0.0595 | -0.1997 | 0.0201 |
| Mean platelet volume | 0.0836 | 0.0789 | -0.3568 | 1 | 0.0434 | 0.0791 | -0.004 | -0.1547 | 0.1288 | 0.07 | -0.0466 | 0.0786 | 0.0384 | 0.0455 |
| Red blood Cells | 0.8907 | 0.8654 | -0.0461 | 0.0434 | 1 | -0.0085 | 0.1159 | -0.0246 | 0.0802 | -0.3361 | 0.0013 | -0.4342 | 0.0387 | -0.184 |
| Lymphocytes | 0.0026 | -0.0034 | 0.0865 | 0.0791 | -0.0085 | 1 | -0.0274 | -0.3333 | 0.238 | 0.0157 | 0.2011 | 0.0279 | 0.066 | -0.0799 |
| Mean corpuscular hemoglobin concentration (MCHC) | 0.1286 | 0.3677 | -0.1704 | -0.004 | 0.1159 | -0.0274 | 1 | -0.0779 | -0.0266 | 0.4589 | -0.0478 | -0.0067 | 0.0843 | -0.3826 |
| Leukocytes | -0.0945 | -0.1095 | 0.4394 | -0.1547 | -0.0246 | -0.3333 | -0.0779 | 1 | -0.3056 | -0.1588 | -0.0964 | -0.1345 | -0.2935 | 0.1569 |
| Basophils | 0.1291 | 0.1163 | -0.0226 | 0.1288 | 0.0802 | 0.238 | -0.0266 | -0.3056 | 1 | 0.067 | 0.3346 | 0.0885 | 0.0976 | 0.0392 |
| Mean corpuscular hemoglobin (MCH) | 0.0698 | 0.1749 | -0.1315 | 0.07 | -0.3361 | 0.0157 | 0.4589 | -0.1588 | 0.067 | 1 | 0.0173 | 0.8841 | 0.1154 | -0.2456 |
| Eosinophils | 0.0274 | 0.0152 | 0.1681 | -0.0466 | 0.0013 | 0.2011 | -0.0478 | -0.0964 | 0.3346 | 0.0173 | 1 | 0.0426 | 0.01 | 0.0041 |
| Mean corpuscular volume (MCV) | 0.0173 | 0.0097 | -0.0595 | 0.0786 | -0.4342 | 0.0279 | -0.0067 | -0.1345 | 0.0885 | 0.8841 | 0.0426 | 1 | 0.0841 | -0.0848 |
| Monocytes | 0.0847 | 0.1003 | -0.1997 | 0.0384 | 0.0387 | 0.066 | 0.0843 | -0.2935 | 0.0976 | 0.1154 | 0.01 | 0.0841 | 1 | -0.0255 |
| Red blood cell distribution width (RDW) | -0.2631 | -0.3377 | 0.0201 | 0.0455 | -0.184 | -0.0799 | -0.3826 | 0.1569 | 0.0392 | -0.2456 | 0.0041 | -0.0848 | -0.0255 | 1 |
# p-values of the correlations between the variables
corr_ps
| Hematocrit | Hemoglobin | Platelets | Mean platelet volume | Red blood Cells | Lymphocytes | Mean corpuscular hemoglobin concentration (MCHC) | Leukocytes | Basophils | Mean corpuscular hemoglobin (MCH) | Eosinophils | Mean corpuscular volume (MCV) | Monocytes | Red blood cell distribution width (RDW) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Hematocrit | 0.0000 | 0.0000 | 0.0298 | 0.0410 | 0.0000 | 0.9499 | 0.0016 | 0.0208 | 0.0016 | 0.0882 | 0.5039 | 0.6728 | 0.0383 | 0.0000 |
| Hemoglobin | 0.0000 | 0.0000 | 0.0016 | 0.0539 | 0.0000 | 0.9338 | 0.0000 | 0.0074 | 0.0044 | 0.0000 | 0.7108 | 0.8133 | 0.0142 | 0.0000 |
| Platelets | 0.0298 | 0.0016 | 0.0000 | 0.0000 | 0.2600 | 0.0345 | 0.0000 | 0.0000 | 0.5808 | 0.0013 | 0.0000 | 0.1463 | 0.0000 | 0.6238 |
| Mean platelet volume | 0.0410 | 0.0539 | 0.0000 | 0.0000 | 0.2890 | 0.0531 | 0.9224 | 0.0001 | 0.0016 | 0.0870 | 0.2557 | 0.0549 | 0.3488 | 0.2664 |
| Red blood Cells | 0.0000 | 0.0000 | 0.2600 | 0.2890 | 0.0000 | 0.8348 | 0.0046 | 0.5476 | 0.0498 | 0.0000 | 0.9753 | 0.0000 | 0.3452 | 0.0000 |
| Lymphocytes | 0.9499 | 0.9338 | 0.0345 | 0.0531 | 0.8348 | 0.0000 | 0.5033 | 0.0000 | 0.0000 | 0.7015 | 0.0000 | 0.4966 | 0.1069 | 0.0508 |
| Mean corpuscular hemoglobin concentration (MCHC) | 0.0016 | 0.0000 | 0.0000 | 0.9224 | 0.0046 | 0.5033 | 0.0000 | 0.0568 | 0.5161 | 0.0000 | 0.2435 | 0.8695 | 0.0394 | 0.0000 |
| Leukocytes | 0.0208 | 0.0074 | 0.0000 | 0.0001 | 0.5476 | 0.0000 | 0.0568 | 0.0000 | 0.0000 | 0.0001 | 0.0183 | 0.0010 | 0.0000 | 0.0001 |
| Basophils | 0.0016 | 0.0044 | 0.5808 | 0.0016 | 0.0498 | 0.0000 | 0.5161 | 0.0000 | 0.0000 | 0.1015 | 0.0000 | 0.0304 | 0.0169 | 0.3382 |
| Mean corpuscular hemoglobin (MCH) | 0.0882 | 0.0000 | 0.0013 | 0.0870 | 0.0000 | 0.7015 | 0.0000 | 0.0001 | 0.1015 | 0.0000 | 0.6728 | 0.0000 | 0.0047 | 0.0000 |
| Eosinophils | 0.5039 | 0.7108 | 0.0000 | 0.2557 | 0.9753 | 0.0000 | 0.2435 | 0.0183 | 0.0000 | 0.6728 | 0.0000 | 0.2979 | 0.8081 | 0.9196 |
| Mean corpuscular volume (MCV) | 0.6728 | 0.8133 | 0.1463 | 0.0549 | 0.0000 | 0.4966 | 0.8695 | 0.0010 | 0.0304 | 0.0000 | 0.2979 | 0.0000 | 0.0397 | 0.0382 |
| Monocytes | 0.0383 | 0.0142 | 0.0000 | 0.3488 | 0.3452 | 0.1069 | 0.0394 | 0.0000 | 0.0169 | 0.0047 | 0.8081 | 0.0397 | 0.0000 | 0.5332 |
| Red blood cell distribution width (RDW) | 0.0000 | 0.0000 | 0.6238 | 0.2664 | 0.0000 | 0.0508 | 0.0000 | 0.0001 | 0.3382 | 0.0000 | 0.9196 | 0.0382 | 0.5332 | 0.0000 |
# Grid of pairwise plots for the columns in blood_columns.
sns.pairplot(df[blood_columns])
<seaborn.axisgrid.PairGrid at 0x2c15336ee88>
# Heatmap of the pairwise correlations between the columns in blood_columns.
sns.heatmap(df[blood_columns].corr())
<matplotlib.axes._subplots.AxesSubplot at 0x2c158f27808>
# Annotated heatmap restricted to the strongest correlations (>= 0.5 or
# <= -0.4); indexing with the boolean frame turns every other cell into NaN.
corr = df[blood_columns].corr()
is_strong = (corr >= 0.5) | (corr <= -0.4)
plt.figure(figsize=(12, 10))
sns.heatmap(
    corr[is_strong],
    cmap='viridis',
    vmax=1.0,
    vmin=-1.0,
    linewidths=0.1,
    annot=True,
    annot_kws={"size": 8},
    square=True,
);

# Clustered heatmap of the full (unfiltered) correlation matrix.
sns.clustermap(corr)
<seaborn.matrix.ClusterGrid at 0x2c158e97fc8>
# Regression plot of each blood column against the age quantile, with one
# hue per SARS-Cov-2 result.
# lmplot is a figure-level function that creates its own figure, so
# plt.figure() is not called here: it only produced one empty figure per
# column and triggered matplotlib's "More than 20 figures have been opened"
# warning.
for col in blood_columns:
    sns.lmplot(x='Patient age quantile', y=col, hue='SARS-Cov-2 exam result', data=df)
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`). C:\ProgramData\Anaconda3\lib\site-packages\seaborn\axisgrid.py:318: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`). fig, axes = plt.subplots(nrow, ncol, **kwargs) C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`). C:\ProgramData\Anaconda3\lib\site-packages\seaborn\axisgrid.py:318: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`). fig, axes = plt.subplots(nrow, ncol, **kwargs) C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`). C:\ProgramData\Anaconda3\lib\site-packages\seaborn\axisgrid.py:318: RuntimeWarning: More than 20 figures have been opened. 
Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`). fig, axes = plt.subplots(nrow, ncol, **kwargs) C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`). C:\ProgramData\Anaconda3\lib\site-packages\seaborn\axisgrid.py:318: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`). fig, axes = plt.subplots(nrow, ncol, **kwargs)
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
# Correlation of every numeric column with the age quantile, in ascending order.
# df also holds text columns; selecting the numeric dtypes first keeps corr()
# working on pandas versions that no longer drop non-numeric columns silently.
df.select_dtypes('number').corr()['Patient age quantile'].sort_values()
Leukocytes -0.166386 Platelets -0.158683 Lymphocytes -0.125935 Mean corpuscular hemoglobin concentration (MCHC) -0.124671 Red blood Cells -0.037510 Patient addmited to intensive care unit (1=yes, 0=no) -0.035772 Patient addmited to semi-intensive unit (1=yes, 0=no) 0.015736 Eosinophils 0.022085 Patient addmited to regular ward (1=yes, 0=no) 0.046166 Monocytes 0.050962 Hemoglobin 0.060320 Hematocrit 0.096808 Basophils 0.107525 Mean platelet volume 0.119449 Red blood cell distribution width (RDW) 0.166429 Mean corpuscular hemoglobin (MCH) 0.197394 Mean corpuscular volume (MCV) 0.281655 Patient age quantile 1.000000 Name: Patient age quantile, dtype: float64
# Contingency table: Influenza A result (rows) against the Influenza A rapid test (columns).
pd.crosstab(df['Influenza A'], df['Influenza A, rapid test'])
| Influenza A, rapid test | negative | positive |
|---|---|---|
| Influenza A | ||
| detected | 2 | 4 |
| not_detected | 245 | 15 |
# Contingency table: Influenza B result (rows) against the Influenza B rapid test (columns).
pd.crosstab(df['Influenza B'], df['Influenza B, rapid test'])
| Influenza B, rapid test | negative | positive |
|---|---|---|
| Influenza B | ||
| detected | 18 | 11 |
| not_detected | 233 | 4 |
# Flag the rows where at least one viral test reads 'detected'.
# viral_columns[:-2] leaves out the last two entries of viral_columns
# (presumably the two rapid tests, given the column order shown earlier).
detected = df[viral_columns[:-2]] == 'detected'
df['est malade'] = detected.any(axis=1)
df.head()
| Patient age quantile | SARS-Cov-2 exam result | Patient addmited to regular ward (1=yes, 0=no) | Patient addmited to semi-intensive unit (1=yes, 0=no) | Patient addmited to intensive care unit (1=yes, 0=no) | Hematocrit | Hemoglobin | Platelets | Mean platelet volume | Red blood Cells | Lymphocytes | Mean corpuscular hemoglobin concentration (MCHC) | Leukocytes | Basophils | Mean corpuscular hemoglobin (MCH) | Eosinophils | Mean corpuscular volume (MCV) | Monocytes | Red blood cell distribution width (RDW) | Respiratory Syncytial Virus | Influenza A | Influenza B | Parainfluenza 1 | CoronavirusNL63 | Rhinovirus/Enterovirus | Coronavirus HKU1 | Parainfluenza 3 | Chlamydophila pneumoniae | Adenovirus | Parainfluenza 4 | Coronavirus229E | CoronavirusOC43 | Inf A H1N1 2009 | Bordetella pertussis | Metapneumovirus | Parainfluenza 2 | Influenza B, rapid test | Influenza A, rapid test | est malade | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 13 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | False |
| 1 | 17 | negative | 0 | 0 | 0 | 0.236515 | -0.02234 | -0.517413 | 0.010677 | 0.102004 | 0.318366 | -0.95079 | -0.09461 | -0.223767 | -0.292269 | 1.482158 | 0.166192 | 0.357547 | -0.625073 | not_detected | not_detected | not_detected | not_detected | not_detected | detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | negative | negative | True |
| 2 | 8 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | False |
| 3 | 5 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | False |
| 4 | 15 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | not_detected | not_detected | not_detected | not_detected | not_detected | detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | NaN | NaN | True |
# Split the rows on the boolean 'est malade' flag. The column is used
# directly as a mask instead of being compared with True / False.
malade_df = df[df['est malade']]
non_malade_df = df[~df['est malade']]

# One figure per blood column, overlaying the two groups' distributions.
# The loop body is indented again; as flattened in the file it was not valid Python.
for col in blood_columns:
    plt.figure()
    sns.distplot(malade_df[col], label='malade')
    sns.distplot(non_malade_df[col], label='non malade')
    plt.legend()
def hospitalisation(df):
    """Return the care level of a single patient row as a French label.

    `df` is one row (anything indexable by column name). The three admission
    columns are checked in order -- regular ward, semi-intensive unit,
    intensive care unit -- and the label of the first one equal to 1 is
    returned. When none of them equals 1 the result is 'inconnu'.
    """
    care_levels = (
        ('Patient addmited to regular ward (1=yes, 0=no)', 'surveillance'),
        ('Patient addmited to semi-intensive unit (1=yes, 0=no)', 'soins semi-intensives'),
        ('Patient addmited to intensive care unit (1=yes, 0=no)', 'soins intensifs'),
    )
    for column, label in care_levels:
        if df[column] == 1:
            return label
    return 'inconnu'
# Label every patient row with its care level (row-wise apply), then preview
# the first rows of the frame.
df['statut'] = df.apply(hospitalisation, axis=1)
df.head()
| Patient age quantile | SARS-Cov-2 exam result | Patient addmited to regular ward (1=yes, 0=no) | Patient addmited to semi-intensive unit (1=yes, 0=no) | Patient addmited to intensive care unit (1=yes, 0=no) | Hematocrit | Hemoglobin | Platelets | Mean platelet volume | Red blood Cells | Lymphocytes | Mean corpuscular hemoglobin concentration (MCHC) | Leukocytes | Basophils | Mean corpuscular hemoglobin (MCH) | Eosinophils | Mean corpuscular volume (MCV) | Monocytes | Red blood cell distribution width (RDW) | Respiratory Syncytial Virus | Influenza A | Influenza B | Parainfluenza 1 | CoronavirusNL63 | Rhinovirus/Enterovirus | Coronavirus HKU1 | Parainfluenza 3 | Chlamydophila pneumoniae | Adenovirus | Parainfluenza 4 | Coronavirus229E | CoronavirusOC43 | Inf A H1N1 2009 | Bordetella pertussis | Metapneumovirus | Parainfluenza 2 | Influenza B, rapid test | Influenza A, rapid test | est malade | statut | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 13 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | False | inconnu |
| 1 | 17 | negative | 0 | 0 | 0 | 0.236515 | -0.02234 | -0.517413 | 0.010677 | 0.102004 | 0.318366 | -0.95079 | -0.09461 | -0.223767 | -0.292269 | 1.482158 | 0.166192 | 0.357547 | -0.625073 | not_detected | not_detected | not_detected | not_detected | not_detected | detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | negative | negative | True | inconnu |
| 2 | 8 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | False | inconnu |
| 3 | 5 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | False | inconnu |
| 4 | 15 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | not_detected | not_detected | not_detected | not_detected | not_detected | detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | NaN | NaN | True | inconnu |
# One figure per blood-test column, overlaying the distribution of that column
# for each distinct value of 'statut'.
statuts = df['statut'].unique()
for col in blood_columns:
    plt.figure()
    for cat in statuts:
        rows = df['statut'] == cat
        sns.distplot(df.loc[rows, col], label=cat)
    plt.legend()
# Number of non-missing values in each column listed in blood_columns.
df[blood_columns].count()
Hematocrit 603 Hemoglobin 603 Platelets 602 Mean platelet volume 599 Red blood Cells 602 Lymphocytes 602 Mean corpuscular hemoglobin concentration (MCHC) 602 Leukocytes 602 Basophils 602 Mean corpuscular hemoglobin (MCH) 602 Eosinophils 602 Mean corpuscular volume (MCV) 602 Monocytes 601 Red blood cell distribution width (RDW) 602 dtype: int64
# Number of non-missing values in each column listed in viral_columns.
df[viral_columns].count()
Respiratory Syncytial Virus 1354 Influenza A 1354 Influenza B 1354 Parainfluenza 1 1352 CoronavirusNL63 1352 Rhinovirus/Enterovirus 1352 Coronavirus HKU1 1352 Parainfluenza 3 1352 Chlamydophila pneumoniae 1352 Adenovirus 1352 Parainfluenza 4 1352 Coronavirus229E 1352 CoronavirusOC43 1352 Inf A H1N1 2009 1352 Bordetella pertussis 1352 Metapneumovirus 1352 Parainfluenza 2 1352 Influenza B, rapid test 820 Influenza A, rapid test 820 dtype: int64
# Share of each SARS-Cov-2 result among the rows that have a value for every
# viral column except the last two entries of viral_columns.
# The selection is copied explicitly: adding 'covid' to a bare slice of df is
# what triggered pandas' SettingWithCopyWarning.
df1 = df[viral_columns[:-2]].copy()
df1['covid'] = df['SARS-Cov-2 exam result']
df1.dropna()['covid'].value_counts(normalize=True)
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
negative 0.91716 positive 0.08284 Name: covid, dtype: float64
# Share of each SARS-Cov-2 result among the rows that have a value for every
# blood-test column.
# The selection is copied explicitly: adding 'covid' to a bare slice of df is
# what triggered pandas' SettingWithCopyWarning.
df2 = df[blood_columns].copy()
df2['covid'] = df['SARS-Cov-2 exam result']
df2.dropna()['covid'].value_counts(normalize=True)
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
negative 0.864548 positive 0.135452 Name: covid, dtype: float64
from scipy.stats import ttest_ind
# (rows, columns) of positive_df.
positive_df.shape
(558, 38)
# (rows, columns) of negative_df.
negative_df.shape
(5086, 38)
# Draw as many negative rows as there are positive rows so both groups compared
# by the t-tests start from the same size. The draw is seeded (same seed as the
# rest of the notebook) so the test verdicts are reproducible between runs.
balanced_neg = negative_df.sample(positive_df.shape[0], random_state=0)
def t_test(col):
    """Compare column *col* between the negative sample and the positive group.

    Runs an independent two-sample t-test on the non-missing values of *col*
    in `balanced_neg` and `positive_df` (both read from the notebook scope)
    and returns a French sentence saying whether H0 is rejected at the 5%
    level.
    """
    alpha = 0.05
    negative_values = balanced_neg[col].dropna()
    positive_values = positive_df[col].dropna()
    _, p = ttest_ind(negative_values, positive_values)
    # Keep the `p < alpha` form: a NaN p-value must land in the second branch.
    return ('H0 Rejetée -- test significatif au niveau 5%'
            if p < alpha
            else 'Non H0 Rejetée -- test non significatif au niveau 5%')
# Print one line per blood-test column: the column name padded with dashes to
# 50 characters, followed by the verdict of its t-test.
for col in blood_columns:
    verdict = t_test(col)
    print(f'{col :-<50} {verdict}')
Hematocrit---------------------------------------- H0 Rejetée -- test significatif au niveau 5% Hemoglobin---------------------------------------- H0 Rejetée -- test significatif au niveau 5% Platelets----------------------------------------- H0 Rejetée -- test significatif au niveau 5% Mean platelet volume ----------------------------- Non H0 Rejetée -- test non significatif au niveau 5% Red blood Cells----------------------------------- H0 Rejetée -- test significatif au niveau 5% Lymphocytes--------------------------------------- Non H0 Rejetée -- test non significatif au niveau 5% Mean corpuscular hemoglobin concentration (MCHC)-- Non H0 Rejetée -- test non significatif au niveau 5% Leukocytes---------------------------------------- H0 Rejetée -- test significatif au niveau 5% Basophils----------------------------------------- Non H0 Rejetée -- test non significatif au niveau 5% Mean corpuscular hemoglobin (MCH)----------------- Non H0 Rejetée -- test non significatif au niveau 5% Eosinophils--------------------------------------- H0 Rejetée -- test significatif au niveau 5% Mean corpuscular volume (MCV)--------------------- Non H0 Rejetée -- test non significatif au niveau 5% Monocytes----------------------------------------- H0 Rejetée -- test significatif au niveau 5% Red blood cell distribution width (RDW)----------- H0 Rejetée -- test significatif au niveau 5%
# Rebind df to a fresh copy of the raw Dataset (columns added to the previous
# df are not carried over), then preview it.
df = Dataset.copy()
df.head()
| Patient age quantile | SARS-Cov-2 exam result | Patient addmited to regular ward (1=yes, 0=no) | Patient addmited to semi-intensive unit (1=yes, 0=no) | Patient addmited to intensive care unit (1=yes, 0=no) | Hematocrit | Hemoglobin | Platelets | Mean platelet volume | Red blood Cells | Lymphocytes | Mean corpuscular hemoglobin concentration (MCHC) | Leukocytes | Basophils | Mean corpuscular hemoglobin (MCH) | Eosinophils | Mean corpuscular volume (MCV) | Monocytes | Red blood cell distribution width (RDW) | Serum Glucose | Respiratory Syncytial Virus | Influenza A | Influenza B | Parainfluenza 1 | CoronavirusNL63 | Rhinovirus/Enterovirus | Mycoplasma pneumoniae | Coronavirus HKU1 | Parainfluenza 3 | Chlamydophila pneumoniae | Adenovirus | Parainfluenza 4 | Coronavirus229E | CoronavirusOC43 | Inf A H1N1 2009 | Bordetella pertussis | Metapneumovirus | Parainfluenza 2 | Neutrophils | Urea | Proteina C reativa mg/dL | Creatinine | Potassium | Sodium | Influenza B, rapid test | Influenza A, rapid test | Alanine transaminase | Aspartate transaminase | Gamma-glutamyltransferase | Total Bilirubin | Direct Bilirubin | Indirect Bilirubin | Alkaline phosphatase | Ionized calcium | Strepto A | Magnesium | pCO2 (venous blood gas analysis) | Hb saturation (venous blood gas analysis) | Base excess (venous blood gas analysis) | pO2 (venous blood gas analysis) | Fio2 (venous blood gas analysis) | Total CO2 (venous blood gas analysis) | pH (venous blood gas analysis) | HCO3 (venous blood gas analysis) | Rods # | Segmented | Promyelocytes | Metamyelocytes | Myelocytes | Myeloblasts | Urine - Esterase | Urine - Aspect | Urine - pH | Urine - Hemoglobin | Urine - Bile pigments | Urine - Ketone Bodies | Urine - Nitrite | Urine - Density | Urine - Urobilinogen | Urine - Protein | Urine - Sugar | Urine - Leukocytes | Urine - Crystals | Urine - Red blood cells | Urine - Hyaline cylinders | Urine - Granular cylinders | Urine - Yeasts | Urine - Color | Partial thromboplastin time (PTT) | 
Relationship (Patient/Normal) | International normalized ratio (INR) | Lactic Dehydrogenase | Prothrombin time (PT), Activity | Vitamin B12 | Creatine phosphokinase (CPK) | Ferritin | Arterial Lactic Acid | Lipase dosage | D-Dimer | Albumin | Hb saturation (arterial blood gases) | pCO2 (arterial blood gas analysis) | Base excess (arterial blood gas analysis) | pH (arterial blood gas analysis) | Total CO2 (arterial blood gas analysis) | HCO3 (arterial blood gas analysis) | pO2 (arterial blood gas analysis) | Arteiral Fio2 | Phosphor | ctO2 (arterial blood gas analysis) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Patient ID | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 44477f75e8169d2 | 13 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 126e9dd13932f68 | 17 | negative | 0 | 0 | 0 | 0.236515 | -0.02234 | -0.517413 | 0.010677 | 0.102004 | 0.318366 | -0.95079 | -0.09461 | -0.223767 | -0.292269 | 1.482158 | 0.166192 | 0.357547 | -0.625073 | -0.140648 | not_detected | not_detected | not_detected | not_detected | not_detected | detected | NaN | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | -0.619086 | 1.198059 | -0.147895 | 2.089928 | -0.305787 | 0.862512 | negative | negative | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| a46b4402a0e5696 | 8 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| f7d619a94f97c45 | 5 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| d9e41465789c2b5 | 15 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | not_detected | not_detected | not_detected | not_detected | not_detected | detected | NaN | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
# Fraction of missing values in each column.
missing_rate = df.isna().sum() / len(df)

# Columns are grouped by how sparse they are: a missing rate strictly between
# 0.88 and 0.9 goes to blood_columns, strictly between 0.75 and 0.80 to
# viral_columns.
in_blood_band = missing_rate.gt(0.88) & missing_rate.lt(0.9)
in_viral_band = missing_rate.gt(0.75) & missing_rate.lt(0.80)
blood_columns = df.columns[in_blood_band].tolist()
viral_columns = df.columns[in_viral_band].tolist()
key_columns = ['Patient age quantile', 'SARS-Cov-2 exam result']

# Keep only the key columns and the two groups, in that order.
df = df[[*key_columns, *blood_columns, *viral_columns]]
df.head()
| Patient age quantile | SARS-Cov-2 exam result | Hematocrit | Hemoglobin | Platelets | Mean platelet volume | Red blood Cells | Lymphocytes | Mean corpuscular hemoglobin concentration (MCHC) | Leukocytes | Basophils | Mean corpuscular hemoglobin (MCH) | Eosinophils | Mean corpuscular volume (MCV) | Monocytes | Red blood cell distribution width (RDW) | Respiratory Syncytial Virus | Influenza A | Influenza B | Parainfluenza 1 | CoronavirusNL63 | Rhinovirus/Enterovirus | Coronavirus HKU1 | Parainfluenza 3 | Chlamydophila pneumoniae | Adenovirus | Parainfluenza 4 | Coronavirus229E | CoronavirusOC43 | Inf A H1N1 2009 | Bordetella pertussis | Metapneumovirus | Parainfluenza 2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Patient ID | |||||||||||||||||||||||||||||||||
| 44477f75e8169d2 | 13 | negative | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 126e9dd13932f68 | 17 | negative | 0.236515 | -0.02234 | -0.517413 | 0.010677 | 0.102004 | 0.318366 | -0.95079 | -0.09461 | -0.223767 | -0.292269 | 1.482158 | 0.166192 | 0.357547 | -0.625073 | not_detected | not_detected | not_detected | not_detected | not_detected | detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected |
| a46b4402a0e5696 | 8 | negative | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| f7d619a94f97c45 | 5 | negative | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| d9e41465789c2b5 | 15 | negative | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | not_detected | not_detected | not_detected | not_detected | not_detected | detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected |
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set (fixed seed for a repeatable split),
# then show the class counts of the training part.
trainset, testset = train_test_split(df, test_size=0.2, random_state=0)
trainset['SARS-Cov-2 exam result'].value_counts()
negative 4068 positive 447 Name: SARS-Cov-2 exam result, dtype: int64
# Class counts of the held-out test part.
testset['SARS-Cov-2 exam result'].value_counts()
negative 1018 positive 111 Name: SARS-Cov-2 exam result, dtype: int64
def encodage(df):
    """Return a copy of *df* with its text columns encoded as 0/1.

    Every object-dtype column is mapped with:
    'negative' / 'not_detected' -> 0 and 'positive' / 'detected' -> 1.
    Values outside that table (including missing values) become NaN, which is
    how ``Series.map`` treats keys absent from the dictionary.

    The input frame is left untouched: the encoding is done on a copy, so
    passing a slice of another frame no longer raises SettingWithCopyWarning,
    and each column is replaced outright so it takes the numeric dtype of the
    mapped values.
    """
    code = {'negative': 0,
            'positive': 1,
            'not_detected': 0,
            'detected': 1}

    encoded = df.copy()
    for col in encoded.select_dtypes('object').columns:
        encoded[col] = encoded[col].map(code)
    return encoded
def feature_engineering(df, columns=None):
    """Collapse the viral-test columns into one boolean 'est malade' column.

    Parameters
    ----------
    df : DataFrame
        Frame whose viral-test columns are already encoded as 0/1.
    columns : list of str, optional
        Columns to collapse. Defaults to the notebook-level `viral_columns`.

    Returns
    -------
    DataFrame
        A new frame without `columns` and with 'est malade' appended as the
        last column; it is True when at least one of the collapsed tests is 1.
        The input frame is not modified, so calling this on a slice of
        another frame no longer raises SettingWithCopyWarning.
    """
    if columns is None:
        columns = viral_columns
    # sum(axis=1) skips NaN, so a row without any viral result sums to 0 and
    # is flagged False rather than left missing.
    est_malade = df[columns].sum(axis=1) >= 1
    return df.drop(columns, axis=1).assign(**{'est malade': est_malade})
def imputation(df):
    """Return *df* without the rows that contain at least one missing value.

    Earlier experiments, kept disabled: adding an 'is na' indicator column
    and filling missing values with -999.
    """
    return df.dropna(axis='index', how='any')
def preprocessing(df):
    """Turn a raw split into a feature frame and a target series.

    Chains encodage -> feature_engineering -> imputation, separates the
    'SARS-Cov-2 exam result' column as the target, prints the target's class
    counts and returns ``(X, y)``.
    """
    target = 'SARS-Cov-2 exam result'
    prepared = imputation(feature_engineering(encodage(df)))
    y = prepared[target]
    X = prepared.drop(columns=target)
    print(y.value_counts())
    return X, y
# Features and target of the training split; preprocessing() also prints the
# class counts of the target.
X_train, y_train = preprocessing(trainset)
0 422 1 65 Name: SARS-Cov-2 exam result, dtype: int64
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py:966: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy self.obj[item] = s C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Features and target of the test split; preprocessing() also prints the
# class counts of the target.
X_test, y_test = preprocessing(testset)
0 95 1 16 Name: SARS-Cov-2 exam result, dtype: int64
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import PolynomialFeatures,StandardScaler
from sklearn.decomposition import PCA
# Baseline models for a first diagnosis
model_1 = DecisionTreeClassifier(random_state=0)
model_2 = RandomForestClassifier(random_state=0)
model_3 = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False),
    SelectKBest(score_func=f_classif, k=10),
    RandomForestClassifier(random_state=0),
)

# Data transformation: degree-2 polynomial features, then the 10 features with
# the best f_classif score
preprocessor = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False),
    SelectKBest(score_func=f_classif, k=10),
)

# Candidate models for optimisation. NOTE: the four pipelines hold the same
# `preprocessor` object; SVM and KNN additionally standardise the features.
RandomForest = make_pipeline(preprocessor, RandomForestClassifier(random_state=0))
AdaBoost = make_pipeline(preprocessor, AdaBoostClassifier(random_state=0))
SVM = make_pipeline(preprocessor, StandardScaler(), SVC(random_state=0))
KNN = make_pipeline(preprocessor, StandardScaler(), KNeighborsClassifier())

dict_of_models = dict(
    RandomForest=RandomForest,
    AdaBoost=AdaBoost,
    SVM=SVM,
    KNN=KNN,
)
from sklearn.metrics import f1_score, confusion_matrix, classification_report
from sklearn.model_selection import learning_curve
def evaluation(model):
    """Fit *model*, report its test-set metrics and plot its learning curves.

    Uses the notebook-level X_train / y_train / X_test / y_test. The model is
    fitted on the training data, then the confusion matrix and the
    classification report of its test predictions are printed. Finally the
    mean train and validation F1 scores (4-fold cross-validation on the
    training data, ten training sizes from 10% to 100%) are plotted against
    the training-set size. Nothing is returned.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    for report in (confusion_matrix, classification_report):
        print(report(y_test, predictions))

    sizes = np.linspace(0.1, 1, 10)
    N, train_score, val_score = learning_curve(
        model, X_train, y_train,
        train_sizes=sizes, cv=4, scoring='f1',
    )

    plt.figure(figsize=(12, 8))
    curves = ((train_score, 'train score'), (val_score, 'validation score'))
    for scores, legend_label in curves:
        # Average over the cross-validation folds for each training size.
        plt.plot(N, scores.mean(axis=1), label=legend_label)
    plt.legend()
# DecisionTreeClassifier is a single decision tree and shows overfitting
evaluation(model_1)
[[87 8]
[10 6]]
precision recall f1-score support
0 0.90 0.92 0.91 95
1 0.43 0.38 0.40 16
accuracy 0.84 111
macro avg 0.66 0.65 0.65 111
weighted avg 0.83 0.84 0.83 111
# Bar chart of the decision tree's feature importances, one bar per training column.
importances_1 = pd.DataFrame(model_1.feature_importances_, index=X_train.columns)
importances_1.plot.bar(figsize=(12, 8))
<matplotlib.axes._subplots.AxesSubplot at 0x1b290587848>
# Evaluate the baseline random forest (model_2).
evaluation(model_2)
[[92 3]
[13 3]]
precision recall f1-score support
0 0.88 0.97 0.92 95
1 0.50 0.19 0.27 16
accuracy 0.86 111
macro avg 0.69 0.58 0.60 111
weighted avg 0.82 0.86 0.83 111
# Bar chart of the random forest's feature importances, one bar per training column.
importances_2 = pd.DataFrame(model_2.feature_importances_, index=X_train.columns)
importances_2.plot.bar(figsize=(12, 8))
<matplotlib.axes._subplots.AxesSubplot at 0x1b294439a88>
# Evaluate model_3: polynomial features + SelectKBest(k=10) + random forest.
evaluation(model_3)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:114: UserWarning: Features [0] are constant. UserWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide f = msb / msw
[[91 4]
[11 5]]
precision recall f1-score support
0 0.89 0.96 0.92 95
1 0.56 0.31 0.40 16
accuracy 0.86 111
macro avg 0.72 0.64 0.66 111
weighted avg 0.84 0.86 0.85 111
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:114: UserWarning: Features [0] are constant. UserWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide f = msb / msw C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:114: UserWarning: Features [0] are constant. UserWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide f = msb / msw C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:114: UserWarning: Features [0] are constant. UserWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide f = msb / msw C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:114: UserWarning: Features [0] are constant. UserWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide f = msb / msw C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:114: UserWarning: Features [0] are constant. UserWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide f = msb / msw C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:114: UserWarning: Features [0] are constant. 
UserWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide f = msb / msw C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:114: UserWarning: Features [0] are constant. UserWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide f = msb / msw C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:114: UserWarning: Features [0] are constant. UserWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide f = msb / msw C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:114: UserWarning: Features [0] are constant. UserWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide f = msb / msw C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:114: UserWarning: Features [0] are constant. UserWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide f = msb / msw C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:114: UserWarning: Features [0] are constant. UserWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide f = msb / msw C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:114: UserWarning: Features [0] are constant. 
UserWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide f = msb / msw C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:114: UserWarning: Features [0] are constant. UserWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide f = msb / msw C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:114: UserWarning: Features [0] are constant. UserWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide f = msb / msw C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:114: UserWarning: Features [0] are constant. UserWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide f = msb / msw C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:114: UserWarning: Features [0] are constant. UserWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide f = msb / msw C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:114: UserWarning: Features [0] are constant. UserWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide f = msb / msw C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:114: UserWarning: Features [0] are constant. 
UserWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide f = msb / msw C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:114: UserWarning: Features [0] are constant. UserWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide f = msb / msw C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:114: UserWarning: Features [0] are constant. UserWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide f = msb / msw C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:114: UserWarning: Features [0] are constant. UserWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide f = msb / msw C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:114: UserWarning: Features [0] are constant. UserWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide f = msb / msw C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:114: UserWarning: Features [0] are constant. UserWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide f = msb / msw C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:114: UserWarning: Features [0] are constant. 
UserWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide f = msb / msw C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:114: UserWarning: Features [0] are constant. UserWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide f = msb / msw C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:114: UserWarning: Features [0] are constant. UserWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide f = msb / msw C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:114: UserWarning: Features [0] are constant. UserWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide f = msb / msw C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:114: UserWarning: Features [0] are constant. UserWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide f = msb / msw C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:114: UserWarning: Features [0] are constant. UserWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide f = msb / msw C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:114: UserWarning: Features [0] are constant. 
UserWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide f = msb / msw C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:114: UserWarning: Features [0] are constant. UserWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide f = msb / msw C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:114: UserWarning: Features [0] are constant. UserWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide f = msb / msw C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:114: UserWarning: Features [0] are constant. UserWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide f = msb / msw C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:114: UserWarning: Features [0] are constant. UserWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide f = msb / msw C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:114: UserWarning: Features [0] are constant. UserWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide f = msb / msw C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:114: UserWarning: Features [0] are constant. 
UserWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide f = msb / msw C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:114: UserWarning: Features [0] are constant. UserWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide f = msb / msw C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:114: UserWarning: Features [0] are constant. UserWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide f = msb / msw C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:114: UserWarning: Features [0] are constant. UserWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide f = msb / msw C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:114: UserWarning: Features [0] are constant. UserWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide f = msb / msw
# Evaluate every candidate pipeline in turn, printing its name as a header
# before its metrics and learning-curve plot.
for name, model in dict_of_models.items():
    print(name)
    evaluation(model)
RandomForest
[[91 4]
[11 5]]
precision recall f1-score support
0 0.89 0.96 0.92 95
1 0.56 0.31 0.40 16
accuracy 0.86 111
macro avg 0.72 0.64 0.66 111
weighted avg 0.84 0.86 0.85 111
AdaBoost
[[91 4]
[ 9 7]]
precision recall f1-score support
0 0.91 0.96 0.93 95
1 0.64 0.44 0.52 16
accuracy 0.88 111
macro avg 0.77 0.70 0.73 111
weighted avg 0.87 0.88 0.87 111
SVM
[[92 3]
[10 6]]
precision recall f1-score support
0 0.90 0.97 0.93 95
1 0.67 0.38 0.48 16
accuracy 0.88 111
macro avg 0.78 0.67 0.71 111
weighted avg 0.87 0.88 0.87 111
KNN
[[88 7]
[ 8 8]]
precision recall f1-score support
0 0.92 0.93 0.92 95
1 0.53 0.50 0.52 16
accuracy 0.86 111
macro avg 0.72 0.71 0.72 111
weighted avg 0.86 0.86 0.86 111
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# Display the SVM pipeline; its step names are the prefixes used by the
# hyper-parameter keys of the search below.
SVM
Pipeline(memory=None,
steps=[('pipeline',
Pipeline(memory=None,
steps=[('polynomialfeatures',
PolynomialFeatures(degree=2,
include_bias=False,
interaction_only=False,
order='C')),
('selectkbest',
SelectKBest(k=10,
score_func=<function f_classif at 0x000001B294563DC8>))],
verbose=False)),
('standardscaler',
StandardScaler(copy=True, with_mean=True, with_std=True)),
('svc',
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
coef0=0.0, decision_function_shape='ovr', degree=3,
gamma='scale', kernel='rbf', max_iter=-1,
probability=False, random_state=0, shrinking=True,
tol=0.001, verbose=False))],
verbose=False)
# Search space for the SVM pipeline. Keys follow the nested step names:
# the outer 'pipeline' step wraps polynomialfeatures/selectkbest, and 'svc'
# is the final classifier.
hyper_params = {'svc__gamma': [1e-3, 1e-4, 0.0005],
                'svc__C': [1, 10, 100, 1000, 3000],
                'pipeline__polynomialfeatures__degree': [2, 3],
                'pipeline__selectkbest__k': range(45, 60)}

# Randomised search: 40 sampled candidates, 4-fold CV, optimising recall.
# random_state pins which candidates are sampled so that reruns report the
# same best_params_, consistent with the random_state=0 used by the models.
grid = RandomizedSearchCV(SVM, hyper_params, scoring='recall', cv=4,
                          n_iter=40, random_state=0)
grid.fit(X_train, y_train)
print(grid.best_params_)

# Test-set report for the best candidate found by the search.
y_pred = grid.predict(X_test)
print(classification_report(y_test, y_pred))
{'svc__gamma': 0.0001, 'svc__C': 3000, 'pipeline__selectkbest__k': 49, 'pipeline__polynomialfeatures__degree': 3}
precision recall f1-score support
0 0.92 0.94 0.93 95
1 0.57 0.50 0.53 16
accuracy 0.87 111
macro avg 0.74 0.72 0.73 111
weighted avg 0.87 0.87 0.87 111
# Run the full evaluation (metrics + learning curves) on the tuned pipeline.
# Note that evaluation() fits the estimator again on X_train.
evaluation(grid.best_estimator_)
[[89 6]
[ 8 8]]
precision recall f1-score support
0 0.92 0.94 0.93 95
1 0.57 0.50 0.53 16
accuracy 0.87 111
macro avg 0.74 0.72 0.73 111
weighted avg 0.87 0.87 0.87 111
from sklearn.metrics import precision_recall_curve
# Decision scores of the tuned pipeline on the test split.
decision_scores = grid.best_estimator_.decision_function(X_test)
precision, recall, threshold = precision_recall_curve(y_test, decision_scores)

# Plot both metrics against the threshold, dropping the final entry of each
# metric array ([:-1]) before plotting.
plt.plot(threshold, precision[:-1], label='precision')
plt.plot(threshold, recall[:-1], label='recall')
plt.legend()
<matplotlib.legend.Legend at 0x1b299991bc8>
def model_final(model, X, threshold=0):
    """Classify samples by comparing the model's decision scores to a cut-off.

    Args:
        model: fitted estimator exposing ``decision_function``.
        X: samples to score; passed unchanged to ``model.decision_function``.
        threshold: cut-off applied to the scores. A sample is positive when
            its score is strictly greater than this value. Defaults to 0.

    Returns:
        The result of ``model.decision_function(X) > threshold``. This is an
        element-wise boolean array when the scores are a NumPy array.
    """
    return model.decision_function(X) > threshold
# Predict with a decision threshold of -1 instead of the default 0, so that
# samples with scores in (-1, 0] are also flagged as positive.
y_pred = model_final(grid.best_estimator_, X_test, threshold=-1)
from sklearn.metrics import recall_score
# F1 score of the thresholded predictions on the test split.
f1_score(y_test, y_pred)
0.5238095238095238
# Recall of the thresholded predictions on the test split.
recall_score(y_test, y_pred)
0.6875
24/08/2021